# Kernel: hconv_bprop_C64_N64

# Copyright 2014 Nervana Systems Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Symbolic names used by the instructions below.
#   addr_*  : shared-memory locations. addr_m, addr_p, addr_q and addr_szLut are four
#             consecutive entries (+4 .. +7), so the single STS.128 to addr_m in the setup
#             code spans all four; addr_lut (+8) is the base of the slice lookup table.
#             NOTE(review): the 4x prefix presumably scales a word index to a byte
#             offset - confirm against the assembler documentation.
#   param_* : kernel parameters, laid out as consecutive 4-byte slots of constant bank 0
#             starting at 0x140. param_O, param_I and param_F each occupy two slots
#             ([0] and [1]); the code feeds them to LEA / LEA.HI.X pairs when forming
#             the global addresses.
<CONSTANT_MAPPING>
    addr_zero  : 4x<64*8*4 + 0>
    addr_m     : 4x<64*8*4 + 4>
    addr_p     : 4x<64*8*4 + 5>
    addr_q     : 4x<64*8*4 + 6>
    addr_szLut : 4x<64*8*4 + 7>
    addr_lut   : 4x<64*8*4 + 8>

    param_O[0]         : c[0x0][0x140]
    param_O[1]         : c[0x0][0x144]
    param_I[0]         : c[0x0][0x148]
    param_I[1]         : c[0x0][0x14c]
    param_F[0]         : c[0x0][0x150]
    param_F[1]         : c[0x0][0x154]
    param_alpha        : c[0x0][0x158]
    param_flags        : c[0x0][0x15c]
    param_N            : c[0x0][0x160]
    param_K            : c[0x0][0x164]
    param_D            : c[0x0][0x168]
    param_H            : c[0x0][0x16c]
    param_W            : c[0x0][0x170]
    param_WN           : c[0x0][0x174]
    param_HWN          : c[0x0][0x178]
    param_DHWN         : c[0x0][0x17c]
    param_C            : c[0x0][0x180]
    param_KRST         : c[0x0][0x184]
    param_RST          : c[0x0][0x188]
    param_RS           : c[0x0][0x18c]
    param_magic_RS     : c[0x0][0x190]
    param_shift_RS     : c[0x0][0x194]
    param_S            : c[0x0][0x198]
    param_magic_S      : c[0x0][0x19c]
    param_shift_S      : c[0x0][0x1a0]
    param_pad_d        : c[0x0][0x1a4]
    param_pad_h        : c[0x0][0x1a8]
    param_pad_w        : c[0x0][0x1ac]
    param_str_d        : c[0x0][0x1b0]
    param_str_h        : c[0x0][0x1b4]
    param_str_w        : c[0x0][0x1b8]
    param_Q            : c[0x0][0x1bc]
    param_PQ           : c[0x0][0x1c0]
    param_QN           : c[0x0][0x1c4]
    param_PQN          : c[0x0][0x1c8]
    param_MPQN         : c[0x0][0x1cc]
    param_magic_Q      : c[0x0][0x1d0]
    param_shift_Q      : c[0x0][0x1d4]
    param_magic_PQ     : c[0x0][0x1d8]
    param_shift_PQ     : c[0x0][0x1dc]
    param_R            : c[0x0][0x1e0]
    param_T            : c[0x0][0x1e4]
    param_magic_str_w  : c[0x0][0x1e8]
    param_shift_str_w  : c[0x0][0x1ec]
    param_magic_str_h  : c[0x0][0x1f0]
    param_shift_str_h  : c[0x0][0x1f4]
    param_magic_str_d  : c[0x0][0x1f8]
    param_shift_str_d  : c[0x0][0x1fc]
</CONSTANT_MAPPING>


# Register names used by the kernel. Many names share the same register numbers, so a
# name is only meaningful in the part of the kernel that uses it:
#   - registers 0-63 are listed both as czero00-63 and as the 64 accumulators cx0-7 / y0-7;
#   - 64-67 are listed as the vector mpq, as m/p/q/tidY, and again inside j0Ix;
#   - 72-107 hold the lookup-table setup temporaries and later the j0/j1 operand sets,
#     the track/load/store registers and finally the output-phase names (c, cs, Out, ...);
#   - storeF4-7, storeI4-7 and offsetF all draw from 104-107, and loadF/storeF0-3
#     (108-111) as well as loadI/storeI0-3 (100-103) overlap pairwise.
# NOTE(review): entries written with ':' versus '~' are presumably fixed versus
# assembler-allocated assignments - confirm against the assembler documentation.
<REGISTER_MAPPING>

      64-67 : mpq<0-3>
      64-67 : m, p, q, tidY
      68-71 ~ tid, blkF, blkI, blkMPQ
      72-95 ~ tid1, tidX
     72-107 ~ str_d, str_h, str_w, pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, one, rst_prime, x_prime, y_prime, z_prime, ballot, warp_slices, partial, endCRST

    0-63    : czero<00-63>

     3, 2,11,10,19,18,27,26 : cx<0-7>y0
     7, 6,15,14,23,22,31,30 : cx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
     5, 4,13,12,21,20,29,28 : cx<0-7>y3
    35,34,43,42,51,50,59,58 : cx<0-7>y4
    39,38,47,46,55,54,63,62 : cx<0-7>y5
    33,32,41,40,49,48,57,56 : cx<0-7>y6
    37,36,45,44,53,52,61,60 : cx<0-7>y7

      64-79 : j0Ix<0-7>, j0Fy<0-7>
      80-95 : j1Ix<0-7>, j1Fy<0-7>

      96-99 : trackI<0-1>, trackF<0-1>

    100-103 : loadI<0-3>
    100-103 : storeI<0-3>
    104-107 : storeI<4-7>

    108-111 : loadF<0-3>
    108-111 : storeF<0-3>
    104-107 : storeF<4-7>

    104-107 ~ offsetF

    112-113 : sliceI, sliceF
    112-113 : sliceIF<0-1>

    114-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc
    126-127 ~ readFs, readIs

    72-83   : c<0-7>, cs<0-3>
    84-85   : Out<0-1>
    86-125  ~ writeCs, readCs, alpha, tidOX, tidOY, to, k, n, MPQN1

</REGISTER_MAPPING>

// Fetch the thread id and the three block ids. Each S2R signals its own barrier (1-4)
// and the first consumer of each value further down waits on the matching mask
// (01, 02, 04, 08).
// NOTE(review): the five-field prefix on every instruction is read here as
// wait:read:write:yield:stall, following the maxas control-code notation - confirm.
--:-:1:-:1      S2R tid,    SR_TID.X;
--:-:2:-:1      S2R blkF,   SR_CTAID.Y;
--:-:3:-:1      S2R blkI,   SR_CTAID.Z;
--:-:4:-:1      S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index

<SCHEDULE_BLOCK>
// P0 = tid >= 32: those threads branch past the lookup-table setup to END_SETUP.
01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;

// Clear the accumulators: store RZ at addr_zero, then load that location into
// czero00, czero04, ... czero60 (16 loads of 128 bits cover registers 0-63).
--:-:-:-:1      STS.128 [addr_zero], RZ;
<CODE>
    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
</CODE>

// tidX = (tid & 7) << 3
// tidY = tid >> 3
--:-:-:-:1      LOP.AND tidX, tid,  7;
--:-:-:-:1      SHL     tidX, tidX, 3;
--:-:-:-:1      SHR.U32 tidY, tid,  3;

// offsetFk = blkF*64 + tidX  (later folded into trackF)
02:-:-:-:1      ISCADD  offsetFk, blkF, tidX, 6;

// offsetIn = blkI*64 + tidX  (later folded into trackI)
04:-:-:-:1      ISCADD  offsetIn, blkI, tidX, 6;

// Remap the X dim to avoid bank conflicts when storing to shared
// We can unmap this in the output
// (tidX is halved here, after the global offsets above were taken from the full value.)
--:-:-:-:1      SHR.U32 tidX, tidX, 1;

// writeS = (64*tidY + tidX) * 4
--:-:-:-:1      ISCADD  writeS, tidY, tidX, 6;
--:-:-:-:1      SHL     writeS, writeS, 2;

// readFs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4;
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readFs, tid,    0x30;
--:-:-:-:1      SHR.U32 readFs, readFs, 3;
--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
--:-:-:-:0      SHL     readFs, readFs, 4;

// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>;
// The added constant is the same offset the I tile is stored at relative to writeS.
--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      ISCADD  readIs, readIs, 4x<8*64>, 4;
</SCHEDULE_BLOCK>

// Threads with tid >= 32 (P0) skip the lookup-table construction entirely; only the
// first 32 threads execute the setup and LUT_LOOP below.
--:-:-:-:5  @P0 BRA.U END_SETUP;

<SCHEDULE_BLOCK>
// Initial loop state: each thread starts at filter position rst = tid, the table is
// empty (lutStore2 = 0, lutSize = 0) and warp_count starts at 32.
--:-:-:-:1      MOV str_d,      param_str_d;
--:-:-:-:1      MOV str_h,      param_str_h;
--:-:-:-:1      MOV str_w,      param_str_w;
--:-:-:-:1      MOV rst,        tid;
--:-:-:-:1      MOV lutStore2,  RZ;
--:-:-:-:1      MOV lutSize,    RZ;
--:-:-:-:1      MOV warp_count, 32;

// m  = blkMPQ / PQ
// pq = blkMPQ % PQ
// (division done as multiply by magic_PQ followed by a right shift by shift_PQ)
08:-:-:-:1      XMAD.LO2C m, blkMPQ, param_magic_PQ, RZ;
--:-:-:-:1      SHR.U32   m, m,      param_shift_PQ;
--:-:-:-:1      XMAD pq,  m, param_PQ, RZ;
--:-:-:-:1      IADD pq, -pq, blkMPQ;
// p = pq / Q
// q = pq % Q
--:-:-:-:1      XMAD.LO2C p, pq, param_magic_Q, RZ;
--:-:-:-:1      SHR.U32   p, p,  param_shift_Q;
--:-:-:-:1      XMAD  q,  p, param_Q, RZ;
--:-:-:-:1      IADD  q, -q, pq;

--:-:-:-:1      MOV dep_thd_mask, -1;

// For odd p mirror q: q = Q - q - 1 (dep_thd_mask still holds -1 at this point).
--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
--:-:-:-:1  @P1 IADD3 q, -q, param_Q, dep_thd_mask;

// Publish m, p, q through shared memory. The 128-bit store starts at register m, so the
// fourth word (tidY in the register map) lands on addr_szLut; that word is rewritten
// with lutSize once the table is complete.
--:-:-:-:1      STS.128 [addr_m], m;

// qs = q - S + pad_w + 1
// pr = p - R + pad_h + 1
// mt = m - T + pad_d + 1
--:-:-:-:1      MOV   one, 1;
--:-:-:-:1      IADD  qs, q, -param_S;
--:-:-:-:1      IADD3 qs, qs, param_pad_w, one;
--:-:-:-:1      IADD  pr, p, -param_R;
--:-:-:-:1      IADD3 pr, pr, param_pad_h, one;
--:-:-:-:1      IADD  mt, m, -param_T;
--:-:-:-:1      IADD3 mt, mt, param_pad_d, one;

// dep_thd_mask = 0xffffffff >> (32 - tid); it is ANDed with the ballot in the loop to
// count the valid slices held by lower-numbered threads.
--:-:-:-:1      IADD    mask_shr, -tid, 32;
--:-:-:-:1      SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr;
</SCHEDULE_BLOCK>

// Build the slice lookup table. Every pass handles 32 filter positions (rst = tid,
// tid + 32, ...): each thread works out which input coordinate its position touches,
// decides whether that slice is valid (P1), and the valid threads append an 8-byte
// entry (sliceI, sliceF) to the table at addr_lut. Entries are packed densely using
// the warp ballot and population counts.
LUT_LOOP:

<SCHEDULE_BLOCK>
// warp synchronous loop while warp_count < RST
--:-:-:-:1      ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
--:-:-:-:1      IADD warp_count, warp_count, 32;
// t =  rst / RS
// rs = rst % RS
--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
--:-:-:-:1      IADD  rs, -rs, rst;
// r = rs / S
// s = rs % S
--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
--:-:-:-:1      SHR.U32   r, r, param_shift_S;
--:-:-:-:1      XMAD   s, r, param_S, RZ;
--:-:-:-:1      IADD   s, -s, rs;
// x = qs + s
// y = pr + r
// z = mt + t
--:-:-:-:1      IADD x, qs, s;
--:-:-:-:1      IADD y, pr, r;
--:-:-:-:1      IADD z, mt, t;
// First validity condition: the coordinate must not be negative.
--:-:-:-:1      ISETP.GE.AND  P4, PT, x, RZ, PT;
--:-:-:-:1      ISETP.GE.AND  P5, PT, y, RZ, PT;
--:-:-:-:1      ISETP.GE.AND  P6, PT, z, RZ, PT;
// s = S - s - 1
// r = R - r - 1
// t = T - t - 1
// rst_prime = t*RS + r*S + s
--:-:-:-:1      IADD3 s, -s, param_S, -one;
--:-:-:-:1      IADD3 r, -r, param_R, -one;
--:-:-:-:1      IADD3 t, -t, param_T, -one;
--:-:-:-:1      XMAD  rst_prime, r, param_S,  s;
--:-:-:-:1      XMAD  rst_prime, t, param_RS, rst_prime;
// x_prime = x / str_w
// x       = x % str_w
--:-:-:-:1      XMAD    x_prime, x, param_magic_str_w, RZ;
--:-:-:-:1      SHR.U32 x_prime, x_prime, param_shift_str_w;
--:-:-:-:1      VMAD.U16.U16 x, -x_prime, str_w, x;
// y_prime = y / str_h
// y       = y % str_h
--:-:-:-:1      XMAD    y_prime, y, param_magic_str_h, RZ;
--:-:-:-:1      SHR.U32 y_prime, y_prime, param_shift_str_h;
--:-:-:-:1      VMAD.U16.U16 y, -y_prime, str_h, y;
// z_prime = z / str_d
// z       = z % str_d
--:-:-:-:1      XMAD    z_prime, z, param_magic_str_d, RZ;
--:-:-:-:1      SHR.U32 z_prime, z_prime, param_shift_str_d;
--:-:-:-:1      VMAD.U16.U16 z, -z_prime, str_d, z;

// Remaining validity conditions: the stride remainder is zero and the quotient is
// inside W / H / D. P1 is the AND of the three per-dimension results.
--:-:-:-:1      ISETP.EQ.AND  P4, PT, x, RZ, P4;
--:-:-:-:1      ISETP.EQ.AND  P5, PT, y, RZ, P5;
--:-:-:-:1      ISETP.EQ.AND  P6, PT, z, RZ, P6;
--:-:-:-:1      ISETP.LT.AND  P4, PT, x_prime, param_W, P4;
--:-:-:-:1      ISETP.LT.AND  P5, PT, y_prime, param_H, P5;
--:-:-:-:1      ISETP.LT.AND  P6, PT, z_prime, param_D, P6;
--:-:-:-:1      PSETP.AND.AND P1, PT, P4, P5, P6;

// sliceI = z_prime*HWN + y_prime*WN + x_prime*N
// (the 01 wait guards sliceI/sliceF against the STS.64 of the previous pass)
01:-:-:-:1      XMAD      sliceI, x_prime, param_N,   RZ;
--:-:-:-:1      XMAD.LO2C sliceI, y_prime, param_WN,  sliceI;
--:-:-:-:1      XMAD.LO2C sliceI, z_prime, param_HWN, sliceI;
// sliceF = rst_prime * K
01:-:-:-:1      XMAD sliceF, rst_prime, param_K, RZ;

<ORDERED>
// Get a mask of all valid slices in the warp
--:-:-:-:1      VOTE.ANY ballot, PT, P1;
// Count the total valid slices
--:-:2:-:1      POPC warp_slices, ballot;
// Prepare lutStore for this and next loop
// (lutStore2 advances by 8 bytes per valid slice found in this pass)
--:-:-:-:1  @P1 MOV    lutStore, lutStore2;
02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
// Count all the valid slices below this threadid
--:-:-:-:1  @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
--:-:3:-:1  @P1 POPC dep_thd_cnt, dep_thd_bits;
// use the rst increment to space the barrier sync
--:-:-:-:1      IADD rst, rst, 32;
// Update the lutStore address from this count
04:-:-:-:1  @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
// Store both slice offsets in the lut
// (sliceIF names the same two registers as sliceI and sliceF)
--:1:-:-:1  @P1 STS.64 [lutStore + addr_lut], sliceIF;
</ORDERED>
// Keep track of the total size of the lut
--:-:-:-:1      IADD lutSize, lutSize, warp_slices;
</SCHEDULE_BLOCK>

--:-:-:-:5  @P0 BRA.U LUT_LOOP;

// Share the lut size with the other warp
--:1:-:-:2      STS [addr_szLut], lutSize;

// All threads meet here. This section reads back the table size, derives the total
// reduction length, performs the first (possibly partial) fetch of F and I into shared
// memory and issues the global loads for the second fetch before entering LOOP.
END_SETUP:

09:-:-:-:5      BAR.SYNC 0;

// Grab the calculated lut size and get its reciprocal
// Get the total reduction depth: endCRST = lutSize * C
--:-:1:-:2      LDS lutSize, [addr_szLut];
01:-:-:-:0      XMAD endCRST, lutSize, param_C, RZ;
--:-:1:-:2      I2F.F32.S32 lutSizeRcp, lutSize;
01:-:1:-:1      MUFU.RCP lutSizeRcp, lutSizeRcp;

<SCHEDULE_BLOCK>
// posCRST = endCRST - tidY - 1
--:-:-:-:1      IADD3 posCRST, endCRST, -1, -tidY;
// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
// If it is a multiple of 8 then make a full 8 line fetch.
--:-:-:-:1      LOP.AND.Z P1, partial, endCRST, 7;
--:-:-:-:1  @P1 MOV partial, 8;
// channel = posCRST / lutSize
// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
// (the FFMA constant equals 2^-24)
--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;
03:-:-:-:1      FMUL channel, posCRSTf, lutSizeRcp;
--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel;
--:-:2:-:1      F2I.S32.F32.TRUNC channel, channel;
// lutOffset = (posCRST % lutSize) * 8
--:-:-:-:1      VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
--:-:-:-:1      SHL lutOffset, lutOffset, 3;
// P1 = tidY < partial
--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partial, PT;
// offsetIC = channel * DHWN
// offsetFC = channel * KRST
--:-:-:-:1      XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
--:-:-:-:1      XMAD      offsetFc, channel, param_KRST, RZ;
// posCRST -= partial
--:-:-:-:1      IADD posCRST, posCRST, -partial;
--:-:1:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
</SCHEDULE_BLOCK>

// trackI = offsetIN + offsetIC + sliceI + param_I
// trackF = offsetFK + offsetFC + sliceF + param_F
// The offsets are shifted left by 1 in the LEA when combined with the base pointers.
01:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     1;
--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;
--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     1;
--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;

// Rows selected by P1 load from global memory; the others read the value stored at
// addr_zero instead, so every thread has data to convert and store below.
--:-:1:-:1  @P1 LDG.E.CI.128 loadF0, [trackF];
--:-:5:-:1 @!P1 LDS.U.128    loadF0, [addr_zero];

--:-:2:-:1  @P1 LDG.E.128 loadI0, [trackI];
--:-:6:-:1 @!P1 LDS.U.128 loadI0, [addr_zero];

// From here on P1 means "the next fetch is still inside the reduction range".
--:-:-:-:0      ISETP.GE.AND P1, PT, posCRST, RZ, PT;

// Widen the 8 half-precision F values to fp32. storeF0-3 overwrite the loadF
// registers and storeF4-7 use 104-107, which storeI4-7 reuse further down.
11:-:-:-:1      F2F.F32.F16 storeF7, loadF3.H1;
--:-:-:-:1      F2F.F32.F16 storeF6, loadF3.H0;
--:-:-:-:1      F2F.F32.F16 storeF5, loadF2.H1;
--:-:1:-:1      F2F.F32.F16 storeF4, loadF2.H0;
--:-:-:-:1      F2F.F32.F16 storeF3, loadF1.H1;
--:-:-:-:1      F2F.F32.F16 storeF2, loadF1.H0;
--:-:-:-:1      F2F.F32.F16 storeF1, loadF0.H1;
--:-:5:-:1      F2F.F32.F16 storeF0, loadF0.H0;

// The two halves of each 8-value row are written 32 words apart.
01:1:-:-:1      STS.128 [writeS + 4x<0*64 + 32>], storeF4;
10:-:-:-:1      STS.128 [writeS + 4x<0*64 +  0>], storeF0;

// Same conversion for the I values; the I tile sits 8*64 words after the F tile.
23:-:-:-:1      F2F.F32.F16 storeI7, loadI3.H1;
--:-:-:-:1      F2F.F32.F16 storeI6, loadI3.H0;
--:-:-:-:1      F2F.F32.F16 storeI5, loadI2.H1;
--:-:1:-:1      F2F.F32.F16 storeI4, loadI2.H0;
--:-:-:-:1      F2F.F32.F16 storeI3, loadI1.H1;
--:-:-:-:1      F2F.F32.F16 storeI2, loadI1.H0;
--:-:-:-:1      F2F.F32.F16 storeI1, loadI0.H1;
--:-:5:-:1      F2F.F32.F16 storeI0, loadI0.H0;

01:-:-:-:1      STS.128 [writeS + 4x<8*64 + 32>], storeI4;
10:1:-:-:1      STS.128 [writeS + 4x<8*64 +  0>], storeI0;

--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;

// Make the first tile visible to all threads, then point writeS at the other buffer.
--:-:-:-:5      BAR.SYNC 0;
01:-:-:-:0      LOP.XOR writeS, writeS, 4x<64*8*2>;

// Preload the j0 operand registers for the first unrolled step of LOOP.
--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*64 + 00>];
--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*64 + 00>];
--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*64 + 32>];
--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*64 + 32>];

<SCHEDULE_BLOCK>

// Address computation for the second fetch (same steps as above, predicated on P1).
// channel = posCRST / lutSize
02:-:-:-:1  @P1 FMUL channel, posCRSTf, lutSizeRcp;
--:-:-:-:1  @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
--:-:2:-:1  @P1 F2I.S32.F32.TRUNC channel, channel;
// lutOffset = (posCRST % lutSize) * 8
02:-:-:-:1  @P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
--:-:-:-:1  @P1 SHL lutOffset, lutOffset, 3;
// offsetIC = channel * DHWN
// offsetFC = channel * KRST
--:-:-:-:1  @P1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
--:-:-:-:1  @P1 XMAD      offsetFc, channel, param_KRST, RZ;

--:-:-:-:1      IADD posCRST, posCRST, -8;
--:-:2:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
</SCHEDULE_BLOCK>

// trackI = offsetIN + offsetIC + sliceI + param_I
// trackF = offsetFK + offsetFC + sliceF + param_F
02:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     1;
--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;
--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     1;
--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;
--:-:2:-:2  @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];
--:-:3:-:1  @P1 LDG.E.128    loadI0, [trackI + 4x< 0>];
// Main loop. The Perl below emits 8 unrolled steps (j) of 64 FFMAs (c) each and
// interleaves, at fixed positions, the instructions that prepare the following fetch.
LOOP:

<CODE>
    # Instructions to interleave with the FFMA stream. A key "jJcC" means: emit the
    # value immediately after FFMA number C of unrolled step J (see the lookup in the
    # emit loop at the bottom of this block).
    my %insert =
    (
        # Loop predicates. P1 (posCRST >= 0) guards the address math and global loads
        # for a further fetch; P0 (posCRST >= -8) guards the conversions, shared
        # stores, barrier, pointer flips and the branch back to LOOP.
        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",

        # channel = posCRST / lutSize, computed in floating point with a 2^-24
        # relative bump before truncation.
        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",

        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",

        # lutOffset = (posCRST % lutSize) * 8, then read the (sliceI, sliceF) entry.
        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",

        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",

        # offsetFc = channel * KRST; offsetIc = channel * DHWN, built from two XMADs
        # (low half of DHWN first, then the .H1 half added through XMAD.PSL).
        j1c20 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
        j1c25 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
        j1c31 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
        j1c32 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",

        # Widen the previously loaded F values to fp32 and store them to shared.
        j1c18 => "02:-:-:-:1  \@P0 F2F.F32.F16 storeF7, loadF3.H1;\n",
        j1c22 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeF6, loadF3.H0;\n",
        j1c26 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeF5, loadF2.H1;\n",
        j1c30 => "--:-:5:-:1  \@P0 F2F.F32.F16 storeF4, loadF2.H0;\n",
        j1c33 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeF3, loadF1.H1;\n",
        j1c37 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeF2, loadF1.H0;\n",
        j1c41 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeF1, loadF0.H1;\n",
        j1c45 => "--:-:2:-:1  \@P0 F2F.F32.F16 storeF0, loadF0.H0;\n",

        j1c47 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 + 32>], storeF4;\n",
        j1c62 => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 +  0>], storeF0;\n",

        # Form the F address and start the F global load. offsetF shares registers
        # with storeF4-7, hence the wait mask on the IADD3.
        j2c19 => "30:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
        j2c24 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
        j2c26 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
        j2c28 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",

        j2c30 => "02:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF];\n",

        # Widen the previously loaded I values to fp32 and store them to shared.
        j5c29 => "04:-:-:-:1  \@P0 F2F.F32.F16 storeI7, loadI3.H1;\n",
        j5c33 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeI6, loadI3.H0;\n",
        j5c37 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeI5, loadI2.H1;\n",
        j5c41 => "--:-:5:-:1  \@P0 F2F.F32.F16 storeI4, loadI2.H0;\n",
        j5c45 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeI3, loadI1.H1;\n",
        j5c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeI2, loadI1.H0;\n",
        j5c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 storeI1, loadI0.H1;\n",
        j5c57 => "--:-:3:-:1  \@P0 F2F.F32.F16 storeI0, loadI0.H0;\n",

        j5c59 => "10:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 + 32>], storeI4;\n",
        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 +  0>], storeI0;\n",

        # Form the I address and start the I global load.
        j6c50 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
        j6c55 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",

        j6c61 => "04:-:3:-:1  \@P1 LDG.E.128 loadI0, [trackI];\n",

        # Synchronize, then flip the read and write pointers to the other buffer
        # by XORing all three with the same constant.
        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" .
                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" .
                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n",


        # Last slot of the last step: loop again while P0 holds.
        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
    );

    # Visiting order of the 64 accumulator coordinates (x, y): for each x pair
    # (0, 2, 4, 6) and each y base in @y, the four points given by the @swirl
    # offsets are appended; @y is reversed after every x pair.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2,4,6)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }

    my $out;
    foreach my $j (0 .. 7)
    {
        # $odd picks the operand set (j0* or j1*) the FFMAs of this step read;
        # $nOdd is the other set, which the loads of this step fill for step j + 1.
        my $odd      = $j & 1;
        my $nOdd     = !$odd + 0;
        # Shared-memory row read for the next step; step 7 wraps to row 0 and its
        # loads are predicated on P0.
        my $rsOffset = ($j + 1) % 8;
        my $rsPred   = $j == 7 ? '@P0' : '   ';

        # Operand loads for the next step always occupy slots 0, 2, 4 and 6.
        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            # The FFMA gets stall 0 when the instruction inserted after it is one
            # of the listed opcodes, otherwise stall 1.
            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;

            # Yield flag only on FFMA 32 of a step, and only when its stall is 1.
            my $yield  = $c == 32 && $stall ? 'Y' : '-';

            # The first FFMA of every step waits on barrier 1, which the fourth
            # operand load (slot 6 of the previous step, or the preload) signals.
            my $wait   = $c == 0 ? '01' : '--';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;

</CODE>

// Output phase setup. The registers that held m, p, q, tid and the block ids were
// reused during the main loop, so they are fetched again: m, p, q from shared memory
// (written by the STS.128 to addr_m during setup) and the ids from the special registers.
--:-:1:-:1      LDS.U.128 mpq, [addr_m];
--:-:2:-:1      S2R tid,  SR_TID.X;
--:-:3:-:1      S2R blkI, SR_CTAID.Z;
--:-:4:-:1      S2R blkF, SR_CTAID.Y;

<SCHEDULE_BLOCK>

// tidOX = (tid & 7) << 3
// tidOY = tid >> 3
02:-:-:-:1      LOP.AND tidOX, tid,   7;
--:-:-:-:1      SHL     tidOX, tidOX, 3;
--:-:-:-:1      SHR.U32 tidOY, tid,   3;

// Keep only the low 11 bits of both read pointers.
// NOTE(review): presumably this removes the tile base that was added to readIs and the
// buffer-select bit flipped by the XORs in the main loop - confirm the constant values.
--:-:-:-:1      LOP.AND readFs, readFs, 0x7ff;
--:-:-:-:1      LOP.AND readIs, readIs, 0x7ff;

// Expand back out to undo our bank conflict avoiding stride
--:-:-:-:1      SHL readIs, readIs, 1;

// Div by 4 here collapses k stride
// writeCs = ((readFs << 4) + readIs) >> 1, with readIs already doubled above
--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
--:-:-:-:1      SHR.U32 writeCs, writeCs, 1;


// readCs  = 2 * (tidOX + (tidOY * 64))
--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
--:-:-:-:1      SHL    readCs, readCs, 1;

// n = blkI*64 + tidOX;
04:-:-:-:1      ISCADD n, blkI, tidOX, 6;

// Mul by 4 here expands k stride back out
// Mul by 2 again to undo the bank conflict avoiding stride
// k = blkF*64 + tidOY * 8
--:-:-:-:1      SHL    tidOY,   tidOY, 3;
08:-:-:-:1      ISCADD k, blkF, tidOY, 6;

// o = k*MPQN + m*PQN + p*QN + q*N + n
// Out = param_O + o * 2 (the LEA shifts the element offset left by 1)
01:-:-:-:1      XMAD      to, q, param_N,    n;
--:-:-:-:1      XMAD.LO2C to, p, param_QN,   to;
--:-:-:-:1      XMAD.LO2C to, m, param_PQN,  to;
--:-:-:-:1      XMAD.LO2C to, k, param_MPQN, to;
--:-:-:-:1      LEA      Out0.CC, to, param_O[0],     1;
--:-:-:-:0      LEA.HI.X Out1,    to, param_O[1], RZ, 1;

// MPQN1 = MPQN * 2: the amount STORE_C adds to Out each time it increments k.
--:-:-:-:1      MOV  MPQN1, param_MPQN;
--:-:-:-:1      SHL  MPQN1, MPQN1, 1;

// P0 is consumed by STORE_C together with the k bound check.
--:-:-:-:1      ISETP.LT.AND P0, PT, n, param_N, PT; // n +  0 < N

--:-:-:-:1      MOV alpha, param_alpha;

</SCHEDULE_BLOCK>

// All threads must be done reading the tiles before STORE_C reuses shared memory.
--:-:-:-:5      BAR.SYNC 0;

<CODE>

    # Emits the epilogue driver. For each accumulator row y (0 .. 7) the eight values
    # cx0..cx7 of that row are multiplied by alpha into c0..c7, then STORE_C is called
    # to write the row out. Each template below takes the row number as its only
    # argument; the last FMUL of a row is the one carrying stall 0 and the blank line.
    my @scale_row =
    (
        "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n",
        "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n",
        "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n",
        "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n",
        "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n",
        "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n",
        "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n",
        "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n\n",
    );
    my $call_store = "--:-:-:-:5      CAL STORE_C;\n\n";

    my @pieces;
    foreach my $row (0 .. 7)
    {
        push @pieces, map { sprintf($_, $row) } @scale_row;
        push @pieces, $call_store;
    }
    return join '', @pieces;

</CODE>

// End of the kernel body; STORE_C below is only reached through CAL.
--:-:-:-:5      EXIT;

// STORE_C: write one output row. On entry c0-c7 hold eight alpha-scaled fp32 results.
// The routine converts them to fp16, exchanges them through shared memory so that each
// thread ends up with the 128 bits it reads from readCs, stores those to Out when
// P1 holds, and then advances k by 1 and Out by MPQN1 for the next call.
STORE_C:

--:-:-:-:2      ISETP.LT.AND P1, PT, k, param_K, P0; // k < K && n < N
--:-:-:-:0      IADD k, k, 1;

// Convert in place to half precision; every second conversion signals a barrier
// (1-4) that the matching BFI below waits on.
--:-:-:-:1      F2F.F16.F32 c0, c0;
--:-:1:-:1      F2F.F16.F32 c1, c1;
--:-:-:-:1      F2F.F16.F32 c2, c2;
--:-:2:-:1      F2F.F16.F32 c3, c3;
--:-:-:-:1      F2F.F16.F32 c4, c4;
--:-:3:-:1      F2F.F16.F32 c5, c5;
--:-:-:-:1      F2F.F16.F32 c6, c6;
--:-:4:-:1      F2F.F16.F32 c7, c7;

// Pack 2 16 bit values into 32 bit words
// (0x1010 = 16 bits at position 16: the odd value goes into the upper half)
01:-:-:-:2      BFI cs0, c1, 0x1010, c0;
02:-:-:-:2      BFI cs1, c3, 0x1010, c2;
04:-:-:-:2      BFI cs2, c5, 0x1010, c4;
08:-:-:-:0      BFI cs3, c7, 0x1010, c6;

// Undo the stride in the X dim (items spaced by 32 are actually spaced 4)
--:-:-:-:4      STS.64 [writeCs + 2x<0>], cs0;
--:-:-:-:1      STS.64 [writeCs + 2x<4>], cs2;
--:-:1:-:2      LDS.U.128 cs0, [readCs];

// Store results back to global
01:1:-:-:2  @P1 STG.E.CG.128 [Out], cs0;

// Advance the 64-bit output pointer by MPQN1 with carry into the high word.
01:-:-:-:6      IADD   Out0.CC, Out0, MPQN1;
--:-:-:-:0      IADD.X Out1,    Out1, RZ;

--:-:-:-:5      RET;

